#include "opencl_source_map.hpp" 
namespace MNN { 
#ifndef MNN_OPENCL_BUFFER_CLOSED
const char* conv_2d_int_buf = 
"#ifdef MNN_SUPPORT_FP16\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#define GLOBAL_SIZE_2_DIMS __private const int global_size_dim0,__private const int global_size_dim1,\n"
"#define DEAL_NON_UNIFORM_DIM2(input1, input2) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { "" return; "" }\n"
"#define MOD_NUM 15\n"
"#ifdef INPUT_CHANNEL_BOUNDARY_PROTECT\n"
" #define PADZEROSVEC(k, channel, data0, data1, data2, data3) "" data0 = (k << 2) < channel ? data0 : 0; "" data1 = (k << 2) + 1 < channel ? data1 : 0; "" data2 = (k << 2) + 2 < channel ? data2 : 0; "" data3=(k << 2)+3<channel ? data3 : 0;\n"
"#else\n"
" #define PADZEROSVEC(k,channel,data0,data1,data2,data3)\n"
"#endif\n"
"__kernel\n"
"void conv_2d_int_c4h1w1(GLOBAL_SIZE_2_DIMS\n"
" __global const FLOAT *input,\n"
"#if QUANT_BIT == 8\n"
" __global const char *weight,\n"
"#else\n"
" __global const uchar *weight,\n"
"#endif\n"
" __global const FLOAT *dequantScaleOffset,\n"
" __global const FLOAT *bias,\n"
" __global FLOAT *output,\n"
" __private const int2 in_hw,\n"
" __private const int inChannel,\n"
" __private const int in_c_blocks,\n"
" __private const int batch,\n"
" __private const int2 out_hw,\n"
" __private const int2 filter_hw,\n"
" __private const int2 stride_hw,\n"
" __private const int2 pad_hw,\n"
" __private const int2 dilate_hw,\n"
" __private const int out_w_blocks,\n"
" __private const int out_c_blocks,\n"
" __private const int out_h_blocks,\n"
" __private const int blockDim,\n"
" __private const float coef) {\n"
" const int out_c_w_idx=get_global_id(0); //c/4 w\n"
" const int out_b_h_idx=get_global_id(1); //b h\n"
" DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
" const int out_c_idx=out_c_w_idx/out_hw.y;\n"
" const int out_w_idx=out_c_w_idx % out_hw.y;\n"
" const int out_b_idx=out_b_h_idx/out_hw.x;//equal to in_b_idx\n"
" const int out_h_idx=out_b_h_idx % out_hw.x;\n"
" \n"
" COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias));\n"
" const int in_w_idx_base=mad24(out_w_idx,stride_hw.y,-pad_hw.y);\n"
" const int in_h_idx_base=mad24(out_h_idx,stride_hw.x,-pad_hw.x);\n"
" \n"
" const int kw_start=select(0,(-in_w_idx_base+dilate_hw.y-1)/dilate_hw.y,in_w_idx_base<0);\n"
" const int kh_start=select(0,(-in_h_idx_base+dilate_hw.x-1)/dilate_hw.x,in_h_idx_base<0);\n"
" const int in_w_idx_start=mad24(kw_start,dilate_hw.y,in_w_idx_base);\n"
" const int in_w_idx_end=min(mad24(filter_hw.y,dilate_hw.y,in_w_idx_base),in_hw.y);\n"
" \n"
" const int in_h_idx_start=mad24(kh_start,dilate_hw.x,in_h_idx_base);\n"
" const int in_h_idx_end=min(mad24(filter_hw.x,dilate_hw.x,in_h_idx_base),in_hw.x);\n"
" \n"
" const int weight_oc_offset=out_c_blocks*filter_hw.x*filter_hw.y*4;\n"
" for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n"
" #ifdef ASYMMETRIC\n"
" COMPUTE_FLOAT8 ScaleOffset=CONVERT_COMPUTE_FLOAT8(convert_float8(vload8(out_c_idx,dequantScaleOffset+(in_c_idx*4)/blockDim*out_c_blocks*8))/coef);\n"
" COMPUTE_FLOAT4 scale=(COMPUTE_FLOAT4)(ScaleOffset.s0,ScaleOffset.s2,ScaleOffset.s4,ScaleOffset.s6);\n"
" COMPUTE_FLOAT4 offset=(COMPUTE_FLOAT4)(ScaleOffset.s1,ScaleOffset.s3,ScaleOffset.s5,ScaleOffset.s7);\n"
" #else\n"
" COMPUTE_FLOAT4 scale=CONVERT_COMPUTE_FLOAT4(convert_float4(vload4(out_c_idx,dequantScaleOffset+(in_c_idx*4)/blockDim*out_c_blocks*4))/coef);\n"
" COMPUTE_FLOAT4 offset=0;\n"
" #endif\n"
" //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n"
" //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n"
" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx) *filter_hw.x+kh_start)*filter_hw.y+kw_start)*4;\n"
" for(int iy=in_h_idx_start; iy<in_h_idx_end; iy += dilate_hw.x) {\n"
" for(int ix=in_w_idx_start; ix<in_w_idx_end; ix += dilate_hw.y) {\n"
" int inp_offset=(((out_b_idx+in_c_idx*batch)*in_hw.x+iy)*in_hw.y+ix)*4;\n"
" COMPUTE_FLOAT4 in0=CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset));\n"
" \n"
" const int filter_w_inc=(ix-in_w_idx_start)/dilate_hw.y;\n"
"#if QUANT_BIT == 8\n"
" char4 charWeight0=vload4(filter_w_inc,weight+weight_offset);\n"
" char4 charWeight1=vload4(filter_w_inc,weight+weight_offset+weight_oc_offset);\n"
" char4 charWeight2=vload4(filter_w_inc,weight+weight_offset+weight_oc_offset*2);\n"
" char4 charWeight3=vload4(filter_w_inc,weight+weight_offset+weight_oc_offset*3);\n"
" COMPUTE_FLOAT4 weight0=CONVERT_COMPUTE_FLOAT4(charWeight0)*scale+offset;\n"
" COMPUTE_FLOAT4 weight1=CONVERT_COMPUTE_FLOAT4(charWeight1)*scale+offset;\n"
" COMPUTE_FLOAT4 weight2=CONVERT_COMPUTE_FLOAT4(charWeight2)*scale+offset;\n"
" COMPUTE_FLOAT4 weight3=CONVERT_COMPUTE_FLOAT4(charWeight3)*scale+offset;\n"
"#else\n"
" uchar2 charWeightInt40=vload2(filter_w_inc,weight+weight_offset/2);\n"
" uchar2 charWeightInt41=vload2(filter_w_inc,weight+weight_offset/2+weight_oc_offset/2);\n"
" uchar2 charWeightInt42=vload2(filter_w_inc,weight+weight_offset/2+weight_oc_offset*2/2);\n"
" uchar2 charWeightInt43=vload2(filter_w_inc,weight+weight_offset/2+weight_oc_offset*3/2);\n"
" char4 charWeight0=(char4)(0,0,0,0);\n"
" char4 charWeight1=(char4)(0,0,0,0);\n"
" char4 charWeight2=(char4)(0,0,0,0);\n"
" char4 charWeight3=(char4)(0,0,0,0);\n"
" charWeight0.x=(charWeightInt40.s0 >> 4)-8;\n"
" charWeight0.y=(charWeightInt40.s0 & MOD_NUM)-8;\n"
" charWeight0.z=(charWeightInt40.s1 >> 4)-8;\n"
" charWeight0.w=(charWeightInt40.s1 & MOD_NUM)-8;\n"
" charWeight1.x=(charWeightInt41.s0 >> 4)-8;\n"
" charWeight1.y=(charWeightInt41.s0 & MOD_NUM)-8;\n"
" charWeight1.z=(charWeightInt41.s1 >> 4)-8;\n"
" charWeight1.w=(charWeightInt41.s1 & MOD_NUM)-8;\n"
" charWeight2.x=(charWeightInt42.s0 >> 4)-8;\n"
" charWeight2.y=(charWeightInt42.s0 & MOD_NUM)-8;\n"
" charWeight2.z=(charWeightInt42.s1 >> 4)-8;\n"
" charWeight2.w=(charWeightInt42.s1 & MOD_NUM)-8;\n"
" charWeight3.x=(charWeightInt43.s0 >> 4)-8;\n"
" charWeight3.y=(charWeightInt43.s0 & MOD_NUM)-8;\n"
" charWeight3.z=(charWeightInt43.s1 >> 4)-8;\n"
" charWeight3.w=(charWeightInt43.s1 & MOD_NUM)-8;\n"
" COMPUTE_FLOAT4 weight0=CONVERT_COMPUTE_FLOAT4(charWeight0)*scale+offset;\n"
" COMPUTE_FLOAT4 weight1=CONVERT_COMPUTE_FLOAT4(charWeight1)*scale+offset;\n"
" COMPUTE_FLOAT4 weight2=CONVERT_COMPUTE_FLOAT4(charWeight2)*scale+offset;\n"
" COMPUTE_FLOAT4 weight3=CONVERT_COMPUTE_FLOAT4(charWeight3)*scale+offset;\n"
"#endif\n"
" PADZEROSVEC(in_c_idx,inChannel,weight0,weight1,weight2,weight3);\n"
" \n"
" out0=mad(in0.x,weight0,out0);\n"
" out0=mad(in0.y,weight1,out0);\n"
" out0=mad(in0.z,weight2,out0);\n"
" out0=mad(in0.w,weight3,out0);\n"
" }\n"
" weight_offset += 4*filter_hw.y;\n"
" }\n"
" }\n"
"#ifdef RELU\n"
" out0=fmax(out0,(COMPUTE_FLOAT4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
"#endif\n"
" const int out_offset=(((out_b_idx+out_c_idx*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" \n"
"}\n"
"__kernel\n"
"void conv_2d_int_c4h1w2(GLOBAL_SIZE_2_DIMS\n"
" __global const FLOAT *input,\n"
"#if QUANT_BIT == 8\n"
" __global const char *weight,\n"
"#else\n"
" __global const uchar *weight,\n"
"#endif\n"
" __global const FLOAT *dequantScaleOffset,\n"
" __global const FLOAT *bias,\n"
" __global FLOAT *output,\n"
" __private const int2 in_hw,\n"
" __private const int inChannel,\n"
" __private const int in_c_blocks,\n"
" __private const int batch,\n"
" __private const int2 out_hw,\n"
" __private const int2 filter_hw,\n"
" __private const int2 stride_hw,\n"
" __private const int2 pad_hw,\n"
" __private const int2 dilate_hw,\n"
" __private const int out_w_blocks,//generate width's num\n"
" __private const int out_c_blocks,\n"
" __private const int out_h_blocks,\n"
" __private const int blockDim,\n"
" __private const float coef) {\n"
" const int out_c_w_idx=get_global_id(0); //c/4 w\n"
" const int out_b_h_idx=get_global_id(1); //b h\n"
" DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
" const int out_c_idx=out_c_w_idx/out_w_blocks;\n"
" const int out_w_idx=(out_c_w_idx % out_w_blocks) << 1;\n"
" const int out_b_idx=out_b_h_idx/out_hw.x;//equal to in_b_idx\n"
" const int out_h_idx=out_b_h_idx % out_hw.x;\n"
" \n"
" COMPUTE_FLOAT4 bias0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias));\n"
" COMPUTE_FLOAT4 out0=bias0;\n"
" COMPUTE_FLOAT4 out1=bias0;\n"
" \n"
" const int in_w0_idx_base=mad24(out_w_idx,stride_hw.y,-pad_hw.y);\n"
" const int in_w1_idx_base=in_w0_idx_base+stride_hw.y;\n"
" const int in_h_idx_base=mad24(out_h_idx,stride_hw.x,-pad_hw.x);\n"
" \n"
" const int kh_start=select(0,(-in_h_idx_base+dilate_hw.x-1)/dilate_hw.x,in_h_idx_base<0);\n"
" const int in_h_idx_start=mad24(kh_start,dilate_hw.x,in_h_idx_base);\n"
" const int in_h_idx_end=min(mad24(filter_hw.x,dilate_hw.x,in_h_idx_base),in_hw.x);\n"
" \n"
" const int weight_oc_offset=out_c_blocks*filter_hw.x*filter_hw.y*4;\n"
" for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n"
" #ifdef ASYMMETRIC\n"
" COMPUTE_FLOAT8 ScaleOffset=CONVERT_COMPUTE_FLOAT8(convert_float8(vload8(out_c_idx,dequantScaleOffset+(in_c_idx*4)/blockDim*out_c_blocks*8))/coef);\n"
" COMPUTE_FLOAT4 scale=(COMPUTE_FLOAT4)(ScaleOffset.s0,ScaleOffset.s2,ScaleOffset.s4,ScaleOffset.s6);\n"
" COMPUTE_FLOAT4 offset=(COMPUTE_FLOAT4)(ScaleOffset.s1,ScaleOffset.s3,ScaleOffset.s5,ScaleOffset.s7);\n"
" #else\n"
" COMPUTE_FLOAT4 scale=CONVERT_COMPUTE_FLOAT4(convert_float4(vload4(out_c_idx,dequantScaleOffset+(in_c_idx*4)/blockDim*out_c_blocks*4))/coef);\n"
" COMPUTE_FLOAT4 offset=0;\n"
" #endif\n"
" //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n"
" //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n"
" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx) *filter_hw.x+kh_start)*filter_hw.y+0)*4;\n"
" for(int iy=in_h_idx_start; iy<in_h_idx_end; iy += dilate_hw.x) {\n"
" const int inp_offset_base=(((out_b_idx+in_c_idx*batch)*in_hw.x+iy)*in_hw.y+0)*4;\n"
" for(int fw=0; fw<filter_hw.y; fw++) {\n"
" const int in_w0_idx=fw*dilate_hw.y+in_w0_idx_base;\n"
" const int in_w1_idx=fw*dilate_hw.y+in_w1_idx_base;\n"
" COMPUTE_FLOAT4 in0=CONVERT_COMPUTE_FLOAT4((in_w0_idx<0 || in_w0_idx >= in_hw.y) ? (FLOAT4)0 : vload4(in_w0_idx,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 in1=CONVERT_COMPUTE_FLOAT4((in_w1_idx<0 || in_w1_idx >= in_hw.y) ? (FLOAT4)0 : vload4(in_w1_idx,input+inp_offset_base));\n"
" \n"
"#if QUANT_BIT == 8\n"
" char4 charWeight0=vload4(0,weight+weight_offset);\n"
" char4 charWeight1=vload4(0,weight+weight_offset+weight_oc_offset);\n"
" char4 charWeight2=vload4(0,weight+weight_offset+weight_oc_offset*2);\n"
" char4 charWeight3=vload4(0,weight+weight_offset+weight_oc_offset*3);\n"
" COMPUTE_FLOAT4 weight0=CONVERT_COMPUTE_FLOAT4(charWeight0)*scale+offset;\n"
" COMPUTE_FLOAT4 weight1=CONVERT_COMPUTE_FLOAT4(charWeight1)*scale+offset;\n"
" COMPUTE_FLOAT4 weight2=CONVERT_COMPUTE_FLOAT4(charWeight2)*scale+offset;\n"
" COMPUTE_FLOAT4 weight3=CONVERT_COMPUTE_FLOAT4(charWeight3)*scale+offset;\n"
"#else\n"
" uchar2 charWeightInt40=vload2(0,weight+weight_offset/2);\n"
" uchar2 charWeightInt41=vload2(0,weight+weight_offset/2+weight_oc_offset/2);\n"
" uchar2 charWeightInt42=vload2(0,weight+weight_offset/2+weight_oc_offset*2/2);\n"
" uchar2 charWeightInt43=vload2(0,weight+weight_offset/2+weight_oc_offset*3/2);\n"
" char4 charWeight0=(char4)(0,0,0,0);\n"
" char4 charWeight1=(char4)(0,0,0,0);\n"
" char4 charWeight2=(char4)(0,0,0,0);\n"
" char4 charWeight3=(char4)(0,0,0,0);\n"
" charWeight0.x=(charWeightInt40.s0 >> 4)-8;\n"
" charWeight0.y=(charWeightInt40.s0 & MOD_NUM)-8;\n"
" charWeight0.z=(charWeightInt40.s1 >> 4)-8;\n"
" charWeight0.w=(charWeightInt40.s1 & MOD_NUM)-8;\n"
" charWeight1.x=(charWeightInt41.s0 >> 4)-8;\n"
" charWeight1.y=(charWeightInt41.s0 & MOD_NUM)-8;\n"
" charWeight1.z=(charWeightInt41.s1 >> 4)-8;\n"
" charWeight1.w=(charWeightInt41.s1 & MOD_NUM)-8;\n"
" charWeight2.x=(charWeightInt42.s0 >> 4)-8;\n"
" charWeight2.y=(charWeightInt42.s0 & MOD_NUM)-8;\n"
" charWeight2.z=(charWeightInt42.s1 >> 4)-8;\n"
" charWeight2.w=(charWeightInt42.s1 & MOD_NUM)-8;\n"
" charWeight3.x=(charWeightInt43.s0 >> 4)-8;\n"
" charWeight3.y=(charWeightInt43.s0 & MOD_NUM)-8;\n"
" charWeight3.z=(charWeightInt43.s1 >> 4)-8;\n"
" charWeight3.w=(charWeightInt43.s1 & MOD_NUM)-8;\n"
" COMPUTE_FLOAT4 weight0=CONVERT_COMPUTE_FLOAT4(charWeight0)*scale+offset;\n"
" COMPUTE_FLOAT4 weight1=CONVERT_COMPUTE_FLOAT4(charWeight1)*scale+offset;\n"
" COMPUTE_FLOAT4 weight2=CONVERT_COMPUTE_FLOAT4(charWeight2)*scale+offset;\n"
" COMPUTE_FLOAT4 weight3=CONVERT_COMPUTE_FLOAT4(charWeight3)*scale+offset;\n"
"#endif\n"
" PADZEROSVEC(in_c_idx,inChannel,weight0,weight1,weight2,weight3);\n"
" \n"
" out0=mad(in0.x,weight0,out0);\n"
" out0=mad(in0.y,weight1,out0);\n"
" out0=mad(in0.z,weight2,out0);\n"
" out0=mad(in0.w,weight3,out0);\n"
" \n"
" out1=mad(in1.x,weight0,out1);\n"
" out1=mad(in1.y,weight1,out1);\n"
" out1=mad(in1.z,weight2,out1);\n"
" out1=mad(in1.w,weight3,out1);\n"
" \n"
" weight_offset += 4;\n"
" }\n"
" }\n"
" }\n"
"#ifdef RELU\n"
" out0=fmax(out0,(COMPUTE_FLOAT4)0);\n"
" out1=fmax(out1,(COMPUTE_FLOAT4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
"#endif\n"
" const int out_offset=(((out_b_idx+out_c_idx*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
"#ifdef BLOCK_LEAVE\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" if(out_w_idx+1 >= out_hw.y) return;\n"
" vstore4(CONVERT_FLOAT4(out1),1,output+out_offset);\n"
"#else\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n"
"#endif\n"
"}\n"
"__kernel\n"
"void conv_2d_int_c4h1w4(GLOBAL_SIZE_2_DIMS\n"
" __global const FLOAT *input,\n"
"#if QUANT_BIT == 8\n"
" __global const char *weight,\n"
"#else\n"
" __global const uchar *weight,\n"
"#endif\n"
" __global const FLOAT *dequantScaleOffset,\n"
" __global const FLOAT *bias,\n"
" __global FLOAT *output,\n"
" __private const int2 in_hw,\n"
" __private const int inChannel,\n"
" __private const int in_c_blocks,\n"
" __private const int batch,\n"
" __private const int2 out_hw,\n"
" __private const int2 filter_hw,\n"
" __private const int2 stride_hw,\n"
" __private const int2 pad_hw,\n"
" __private const int2 dilate_hw,\n"
" __private const int out_w_blocks,\n"
" __private const int out_c_blocks,\n"
" __private const int out_h_blocks,\n"
" __private const int blockDim,\n"
" __private const float coef) {\n"
" const int out_c_w_idx=get_global_id(0); //c/4 w\n"
" const int out_b_h_idx=get_global_id(1); //b h\n"
" DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
" const int out_c_idx=out_c_w_idx/out_w_blocks;\n"
" const int out_w_idx=(out_c_w_idx % out_w_blocks) << 2;\n"
" const int out_b_idx=out_b_h_idx/out_hw.x;//equal to in_b_idx\n"
" const int out_h_idx=out_b_h_idx % out_hw.x;\n"
" COMPUTE_FLOAT4 bias0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias));\n"
" COMPUTE_FLOAT4 out0=bias0;\n"
" COMPUTE_FLOAT4 out1=bias0;\n"
" COMPUTE_FLOAT4 out2=bias0;\n"
" COMPUTE_FLOAT4 out3=bias0;\n"
" const int in_w0_idx_base=mad24(out_w_idx,stride_hw.y,-pad_hw.y);\n"
" const int in_w1_idx_base=in_w0_idx_base+stride_hw.y;\n"
" const int in_w2_idx_base=in_w1_idx_base+stride_hw.y;\n"
" const int in_w3_idx_base=in_w2_idx_base+stride_hw.y;\n"
" const int in_h_idx_base=mad24(out_h_idx,stride_hw.x,-pad_hw.x);\n"
" \n"
" const int kh_start=select(0,(-in_h_idx_base+dilate_hw.x-1)/dilate_hw.x,in_h_idx_base<0);\n"
" const int in_h_idx_start=mad24(kh_start,dilate_hw.x,in_h_idx_base);\n"
" const int in_h_idx_end=min(mad24(filter_hw.x,dilate_hw.x,in_h_idx_base),in_hw.x);\n"
" \n"
" const int weight_oc_offset=out_c_blocks*filter_hw.x*filter_hw.y*4;\n"
" for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n"
" #ifdef ASYMMETRIC\n"
" COMPUTE_FLOAT8 ScaleOffset=CONVERT_COMPUTE_FLOAT8(convert_float8(vload8(out_c_idx,dequantScaleOffset+(in_c_idx*4)/blockDim*out_c_blocks*8))/coef);\n"
" COMPUTE_FLOAT4 scale=(COMPUTE_FLOAT4)(ScaleOffset.s0,ScaleOffset.s2,ScaleOffset.s4,ScaleOffset.s6);\n"
" COMPUTE_FLOAT4 offset=(COMPUTE_FLOAT4)(ScaleOffset.s1,ScaleOffset.s3,ScaleOffset.s5,ScaleOffset.s7);\n"
" #else\n"
" COMPUTE_FLOAT4 scale=CONVERT_COMPUTE_FLOAT4(convert_float4(vload4(out_c_idx,dequantScaleOffset+(in_c_idx*4)/blockDim*out_c_blocks*4))/coef);\n"
" COMPUTE_FLOAT4 offset=0;\n"
" #endif\n"
" //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n"
" //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n"
" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx) *filter_hw.x+kh_start)*filter_hw.y+0)*4;\n"
" for(int iy=in_h_idx_start; iy<in_h_idx_end; iy += dilate_hw.x) {\n"
" const int inp_offset_base=(((out_b_idx+in_c_idx*batch)*in_hw.x+iy)*in_hw.y+0)*4;\n"
" for(int fw=0; fw<filter_hw.y; fw++) {\n"
" const int in_w0_idx=fw*dilate_hw.y+in_w0_idx_base;\n"
" const int in_w1_idx=fw*dilate_hw.y+in_w1_idx_base;\n"
" const int in_w2_idx=fw*dilate_hw.y+in_w2_idx_base;\n"
" const int in_w3_idx=fw*dilate_hw.y+in_w3_idx_base;\n"
" COMPUTE_FLOAT4 in0=CONVERT_COMPUTE_FLOAT4((in_w0_idx<0 || in_w0_idx >= in_hw.y) ? (FLOAT4)0 : vload4(in_w0_idx,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 in1=CONVERT_COMPUTE_FLOAT4((in_w1_idx<0 || in_w1_idx >= in_hw.y) ? (FLOAT4)0 : vload4(in_w1_idx,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 in2=CONVERT_COMPUTE_FLOAT4((in_w2_idx<0 || in_w2_idx >= in_hw.y) ? (FLOAT4)0 : vload4(in_w2_idx,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 in3=CONVERT_COMPUTE_FLOAT4((in_w3_idx<0 || in_w3_idx >= in_hw.y) ? (FLOAT4)0 : vload4(in_w3_idx,input+inp_offset_base));\n"
"#if QUANT_BIT == 8\n"
" char4 charWeight0=vload4(0,weight+weight_offset);\n"
" char4 charWeight1=vload4(0,weight+weight_offset+weight_oc_offset);\n"
" char4 charWeight2=vload4(0,weight+weight_offset+weight_oc_offset*2);\n"
" char4 charWeight3=vload4(0,weight+weight_offset+weight_oc_offset*3);\n"
" COMPUTE_FLOAT4 weight0=CONVERT_COMPUTE_FLOAT4(charWeight0)*scale+offset;\n"
" COMPUTE_FLOAT4 weight1=CONVERT_COMPUTE_FLOAT4(charWeight1)*scale+offset;\n"
" COMPUTE_FLOAT4 weight2=CONVERT_COMPUTE_FLOAT4(charWeight2)*scale+offset;\n"
" COMPUTE_FLOAT4 weight3=CONVERT_COMPUTE_FLOAT4(charWeight3)*scale+offset;\n"
"#else\n"
" uchar2 charWeightInt40=vload2(0,weight+weight_offset/2);\n"
" uchar2 charWeightInt41=vload2(0,weight+weight_offset/2+weight_oc_offset/2);\n"
" uchar2 charWeightInt42=vload2(0,weight+weight_offset/2+weight_oc_offset*2/2);\n"
" uchar2 charWeightInt43=vload2(0,weight+weight_offset/2+weight_oc_offset*3/2);\n"
" char4 charWeight0=(char4)(0,0,0,0);\n"
" char4 charWeight1=(char4)(0,0,0,0);\n"
" char4 charWeight2=(char4)(0,0,0,0);\n"
" char4 charWeight3=(char4)(0,0,0,0);\n"
" charWeight0.x=(charWeightInt40.s0 >> 4)-8;\n"
" charWeight0.y=(charWeightInt40.s0 & MOD_NUM)-8;\n"
" charWeight0.z=(charWeightInt40.s1 >> 4)-8;\n"
" charWeight0.w=(charWeightInt40.s1 & MOD_NUM)-8;\n"
" charWeight1.x=(charWeightInt41.s0 >> 4)-8;\n"
" charWeight1.y=(charWeightInt41.s0 & MOD_NUM)-8;\n"
" charWeight1.z=(charWeightInt41.s1 >> 4)-8;\n"
" charWeight1.w=(charWeightInt41.s1 & MOD_NUM)-8;\n"
" charWeight2.x=(charWeightInt42.s0 >> 4)-8;\n"
" charWeight2.y=(charWeightInt42.s0 & MOD_NUM)-8;\n"
" charWeight2.z=(charWeightInt42.s1 >> 4)-8;\n"
" charWeight2.w=(charWeightInt42.s1 & MOD_NUM)-8;\n"
" charWeight3.x=(charWeightInt43.s0 >> 4)-8;\n"
" charWeight3.y=(charWeightInt43.s0 & MOD_NUM)-8;\n"
" charWeight3.z=(charWeightInt43.s1 >> 4)-8;\n"
" charWeight3.w=(charWeightInt43.s1 & MOD_NUM)-8;\n"
" COMPUTE_FLOAT4 weight0=CONVERT_COMPUTE_FLOAT4(charWeight0)*scale+offset;\n"
" COMPUTE_FLOAT4 weight1=CONVERT_COMPUTE_FLOAT4(charWeight1)*scale+offset;\n"
" COMPUTE_FLOAT4 weight2=CONVERT_COMPUTE_FLOAT4(charWeight2)*scale+offset;\n"
" COMPUTE_FLOAT4 weight3=CONVERT_COMPUTE_FLOAT4(charWeight3)*scale+offset;\n"
"#endif\n"
" PADZEROSVEC(in_c_idx,inChannel,weight0,weight1,weight2,weight3);\n"
" out0=mad(in0.x,weight0,out0);\n"
" out0=mad(in0.y,weight1,out0);\n"
" out0=mad(in0.z,weight2,out0);\n"
" out0=mad(in0.w,weight3,out0);\n"
" \n"
" out1=mad(in1.x,weight0,out1);\n"
" out1=mad(in1.y,weight1,out1);\n"
" out1=mad(in1.z,weight2,out1);\n"
" out1=mad(in1.w,weight3,out1);\n"
" \n"
" out2=mad(in2.x,weight0,out2);\n"
" out2=mad(in2.y,weight1,out2);\n"
" out2=mad(in2.z,weight2,out2);\n"
" out2=mad(in2.w,weight3,out2);\n"
" \n"
" out3=mad(in3.x,weight0,out3);\n"
" out3=mad(in3.y,weight1,out3);\n"
" out3=mad(in3.z,weight2,out3);\n"
" out3=mad(in3.w,weight3,out3);\n"
" \n"
" weight_offset += 4;\n"
" }\n"
" }\n"
" }\n"
" \n"
"#ifdef RELU\n"
" out0=fmax(out0,(COMPUTE_FLOAT4)0);\n"
" out1=fmax(out1,(COMPUTE_FLOAT4)0);\n"
" out2=fmax(out2,(COMPUTE_FLOAT4)0);\n"
" out3=fmax(out3,(COMPUTE_FLOAT4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out2=clamp(out2,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out3=clamp(out3,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
"#endif\n"
" const int out_offset=(((out_b_idx+out_c_idx*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
"#ifdef BLOCK_LEAVE\n"
" const int remain=out_hw.y-out_w_idx;\n"
" if (remain >= 4) {\n"
" vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,output+out_offset);\n"
" }else if(remain == 3){\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out2),2,output+out_offset);\n"
" }else if(remain == 2){\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n"
" }else if(remain == 1){\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" }\n"
"#else\n"
" vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,output+out_offset);\n"
"#endif\n"
"}\n"
"__kernel\n"
"void conv_2d_int_c8h1w4(GLOBAL_SIZE_2_DIMS\n"
" __global const FLOAT *input,\n"
"#if QUANT_BIT == 8\n"
" __global const char *weight,\n"
"#else\n"
" __global const uchar *weight,\n"
"#endif\n"
" __global const FLOAT *dequantScaleOffset,\n"
" __global const FLOAT *bias,\n"
" __global FLOAT *output,\n"
" __private const int2 in_hw,\n"
" __private const int inChannel,\n"
" __private const int in_c_blocks,\n"
" __private const int batch,\n"
" __private const int2 out_hw,\n"
" __private const int2 filter_hw,\n"
" __private const int2 stride_hw,\n"
" __private const int2 pad_hw,\n"
" __private const int2 dilate_hw,\n"
" __private const int out_w_blocks,\n"
" __private const int out_c_blocks,\n"
" __private const int out_h_blocks,\n"
" __private const int blockDim,\n"
" __private const float coef) {\n"
" const int out_c_w_idx=get_global_id(0); //c/4 w\n"
" const int out_b_h_idx=get_global_id(1); //b h\n"
" DEAL_NON_UNIFORM_DIM2(out_c_w_idx,out_b_h_idx);\n"
" const int out_c_idx_0=(out_c_w_idx/out_w_blocks) << 1;\n"
" const int out_c_idx_1=out_c_idx_0+1;\n"
" const int out_w_idx=(out_c_w_idx % out_w_blocks) << 2;\n"
" const int out_b_idx=out_b_h_idx/out_hw.x;//equal to in_b_idx\n"
" const int out_h_idx=out_b_h_idx % out_hw.x;\n"
" \n"
" COMPUTE_FLOAT4 bias0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_0,bias));\n"
" COMPUTE_FLOAT4 out0=bias0;\n"
" COMPUTE_FLOAT4 out1=bias0;\n"
" COMPUTE_FLOAT4 out2=bias0;\n"
" COMPUTE_FLOAT4 out3=bias0;\n"
" // bias align to 8,no need boundry protect\n"
" COMPUTE_FLOAT4 bias1=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx_1,bias));\n"
" COMPUTE_FLOAT4 out4=bias1;\n"
" COMPUTE_FLOAT4 out5=bias1;\n"
" COMPUTE_FLOAT4 out6=bias1;\n"
" COMPUTE_FLOAT4 out7=bias1;\n"
" const int in_w0_idx_base=mad24(out_w_idx,stride_hw.y,-pad_hw.y);\n"
" const int in_w1_idx_base=in_w0_idx_base+stride_hw.y;\n"
" const int in_w2_idx_base=in_w1_idx_base+stride_hw.y;\n"
" const int in_w3_idx_base=in_w2_idx_base+stride_hw.y;\n"
" const int in_h_idx_base=mad24(out_h_idx,stride_hw.x,-pad_hw.x);\n"
" \n"
" const int kh_start=select(0,(-in_h_idx_base+dilate_hw.x-1)/dilate_hw.x,in_h_idx_base<0);\n"
" const int in_h_idx_start=mad24(kh_start,dilate_hw.x,in_h_idx_base);\n"
" const int in_h_idx_end=min(mad24(filter_hw.x,dilate_hw.x,in_h_idx_base),in_hw.x);\n"
" \n"
" const int weight_oc_offset=filter_hw.x*filter_hw.y*4;\n"
" const int weight_ic_offset=out_c_blocks*weight_oc_offset;\n"
" for(ushort in_c_idx=0; in_c_idx<in_c_blocks; in_c_idx++) {\n"
" #ifdef ASYMMETRIC\n"
" COMPUTE_FLOAT8 ScaleOffset0=CONVERT_COMPUTE_FLOAT8(convert_float8(vload8(out_c_idx_0,dequantScaleOffset+(in_c_idx*4)/blockDim*out_c_blocks*8))/coef);\n"
" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
" COMPUTE_FLOAT8 ScaleOffset1=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT8)0 : CONVERT_COMPUTE_FLOAT8(convert_float8(vload8(out_c_idx_1,dequantScaleOffset+(in_c_idx*4)/blockDim*out_c_blocks*8))/coef);\n"
" #else\n"
" COMPUTE_FLOAT8 ScaleOffset1=CONVERT_COMPUTE_FLOAT8(convert_float8(vload8(out_c_idx_1,dequantScaleOffset+(in_c_idx*4)/blockDim*out_c_blocks*8))/coef);\n"
" #endif\n"
" COMPUTE_FLOAT4 scale0=(COMPUTE_FLOAT4)(ScaleOffset0.s0,ScaleOffset0.s2,ScaleOffset0.s4,ScaleOffset0.s6);\n"
" COMPUTE_FLOAT4 offset0=(COMPUTE_FLOAT4)(ScaleOffset0.s1,ScaleOffset0.s3,ScaleOffset0.s5,ScaleOffset0.s7);\n"
" COMPUTE_FLOAT4 scale1=(COMPUTE_FLOAT4)(ScaleOffset1.s0,ScaleOffset1.s2,ScaleOffset1.s4,ScaleOffset1.s6);\n"
" COMPUTE_FLOAT4 offset1=(COMPUTE_FLOAT4)(ScaleOffset1.s1,ScaleOffset1.s3,ScaleOffset1.s5,ScaleOffset1.s7);\n"
" #else\n"
" COMPUTE_FLOAT4 scale0=CONVERT_COMPUTE_FLOAT4(convert_float4(vload4(out_c_idx_0,dequantScaleOffset+(in_c_idx*4)/blockDim*out_c_blocks*4))/coef);\n"
" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
" COMPUTE_FLOAT4 scale1=out_c_idx_1 >= out_c_blocks ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(convert_float4(vload4(out_c_idx_1,dequantScaleOffset+(in_c_idx*4)/blockDim*out_c_blocks*4))/coef);\n"
" #else\n"
" COMPUTE_FLOAT4 scale1=CONVERT_COMPUTE_FLOAT4(convert_float4(vload4(out_c_idx_1,dequantScaleOffset+(in_c_idx*4)/blockDim*out_c_blocks*4))/coef);\n"
" #endif\n"
" COMPUTE_FLOAT4 offset0=0,offset1=0;\n"
" #endif\n"
" //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n"
" //index: [0,4*in_c_idx,out_c_idx_0*kh*kw+kh_start*kw+kw_start,0]\n"
" int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx_0) *filter_hw.x+kh_start)*filter_hw.y+0)*4;\n"
" for(int iy=in_h_idx_start; iy<in_h_idx_end; iy += dilate_hw.x) {\n"
" const int inp_offset_base=(((out_b_idx+in_c_idx*batch)*in_hw.x+iy)*in_hw.y+0)*4;\n"
" for(int fw=0; fw<filter_hw.y; fw++) {\n"
" const int in_w0_idx=fw*dilate_hw.y+in_w0_idx_base;\n"
" const int in_w1_idx=fw*dilate_hw.y+in_w1_idx_base;\n"
" const int in_w2_idx=fw*dilate_hw.y+in_w2_idx_base;\n"
" const int in_w3_idx=fw*dilate_hw.y+in_w3_idx_base;\n"
" COMPUTE_FLOAT4 in0=CONVERT_COMPUTE_FLOAT4((in_w0_idx<0 || in_w0_idx >= in_hw.y) ? (FLOAT4)0 : vload4(in_w0_idx,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 in1=CONVERT_COMPUTE_FLOAT4((in_w1_idx<0 || in_w1_idx >= in_hw.y) ? (FLOAT4)0 : vload4(in_w1_idx,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 in2=CONVERT_COMPUTE_FLOAT4((in_w2_idx<0 || in_w2_idx >= in_hw.y) ? (FLOAT4)0 : vload4(in_w2_idx,input+inp_offset_base));\n"
" COMPUTE_FLOAT4 in3=CONVERT_COMPUTE_FLOAT4((in_w3_idx<0 || in_w3_idx >= in_hw.y) ? (FLOAT4)0 : vload4(in_w3_idx,input+inp_offset_base));\n"
"#if QUANT_BIT == 8\n"
" char4 charWeight0=vload4(0,weight+weight_offset);\n"
" char4 charWeight1=vload4(0,weight+weight_offset+weight_ic_offset);\n"
" char4 charWeight2=vload4(0,weight+weight_offset+weight_ic_offset*2);\n"
" char4 charWeight3=vload4(0,weight+weight_offset+weight_ic_offset*3);\n"
" COMPUTE_FLOAT4 weight0=CONVERT_COMPUTE_FLOAT4(charWeight0)*scale0+offset0;\n"
" COMPUTE_FLOAT4 weight1=CONVERT_COMPUTE_FLOAT4(charWeight1)*scale0+offset0;\n"
" COMPUTE_FLOAT4 weight2=CONVERT_COMPUTE_FLOAT4(charWeight2)*scale0+offset0;\n"
" COMPUTE_FLOAT4 weight3=CONVERT_COMPUTE_FLOAT4(charWeight3)*scale0+offset0;\n"
"#else\n"
" uchar2 charWeightInt40=vload2(0,weight+weight_offset/2);\n"
" uchar2 charWeightInt41=vload2(0,weight+weight_offset/2+weight_ic_offset/2);\n"
" uchar2 charWeightInt42=vload2(0,weight+weight_offset/2+weight_ic_offset*2/2);\n"
" uchar2 charWeightInt43=vload2(0,weight+weight_offset/2+weight_ic_offset*3/2);\n"
" char4 charWeight0=(char4)(0,0,0,0);\n"
" char4 charWeight1=(char4)(0,0,0,0);\n"
" char4 charWeight2=(char4)(0,0,0,0);\n"
" char4 charWeight3=(char4)(0,0,0,0);\n"
" charWeight0.x=(charWeightInt40.s0 >> 4)-8;\n"
" charWeight0.y=(charWeightInt40.s0 & MOD_NUM)-8;\n"
" charWeight0.z=(charWeightInt40.s1 >> 4)-8;\n"
" charWeight0.w=(charWeightInt40.s1 & MOD_NUM)-8;\n"
" charWeight1.x=(charWeightInt41.s0 >> 4)-8;\n"
" charWeight1.y=(charWeightInt41.s0 & MOD_NUM)-8;\n"
" charWeight1.z=(charWeightInt41.s1 >> 4)-8;\n"
" charWeight1.w=(charWeightInt41.s1 & MOD_NUM)-8;\n"
" charWeight2.x=(charWeightInt42.s0 >> 4)-8;\n"
" charWeight2.y=(charWeightInt42.s0 & MOD_NUM)-8;\n"
" charWeight2.z=(charWeightInt42.s1 >> 4)-8;\n"
" charWeight2.w=(charWeightInt42.s1 & MOD_NUM)-8;\n"
" charWeight3.x=(charWeightInt43.s0 >> 4)-8;\n"
" charWeight3.y=(charWeightInt43.s0 & MOD_NUM)-8;\n"
" charWeight3.z=(charWeightInt43.s1 >> 4)-8;\n"
" charWeight3.w=(charWeightInt43.s1 & MOD_NUM)-8;\n"
" COMPUTE_FLOAT4 weight0=CONVERT_COMPUTE_FLOAT4(charWeight0)*scale0+offset0;\n"
" COMPUTE_FLOAT4 weight1=CONVERT_COMPUTE_FLOAT4(charWeight1)*scale0+offset0;\n"
" COMPUTE_FLOAT4 weight2=CONVERT_COMPUTE_FLOAT4(charWeight2)*scale0+offset0;\n"
" COMPUTE_FLOAT4 weight3=CONVERT_COMPUTE_FLOAT4(charWeight3)*scale0+offset0;\n"
"#endif\n"
" PADZEROSVEC(in_c_idx,inChannel,weight0,weight1,weight2,weight3);\n"
" out0=mad(in0.x,weight0,out0);\n"
" out0=mad(in0.y,weight1,out0);\n"
" out0=mad(in0.z,weight2,out0);\n"
" out0=mad(in0.w,weight3,out0);\n"
" \n"
" out1=mad(in1.x,weight0,out1);\n"
" out1=mad(in1.y,weight1,out1);\n"
" out1=mad(in1.z,weight2,out1);\n"
" out1=mad(in1.w,weight3,out1);\n"
" \n"
" out2=mad(in2.x,weight0,out2);\n"
" out2=mad(in2.y,weight1,out2);\n"
" out2=mad(in2.z,weight2,out2);\n"
" out2=mad(in2.w,weight3,out2);\n"
" \n"
" out3=mad(in3.x,weight0,out3);\n"
" out3=mad(in3.y,weight1,out3);\n"
" out3=mad(in3.z,weight2,out3);\n"
" out3=mad(in3.w,weight3,out3);\n"
" \n"
"#if QUANT_BIT == 8\n"
" #ifdef CHANNEL_BOUNDARY_PROTECT\n"
" charWeight0=out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0,weight+weight_offset+weight_oc_offset);\n"
" charWeight1=out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset);\n"
" charWeight2=out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2);\n"
" charWeight3=out_c_idx_1 >= out_c_blocks ? (char4)0 : vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3);\n"
" #else\n"
" charWeight0=vload4(0,weight+weight_offset+weight_oc_offset);\n"
" charWeight1=vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset);\n"
" charWeight2=vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*2);\n"
" charWeight3=vload4(0,weight+weight_offset+weight_oc_offset+weight_ic_offset*3);\n"
" #endif\n"
" weight0=CONVERT_COMPUTE_FLOAT4(charWeight0)*scale1+offset1;\n"
" weight1=CONVERT_COMPUTE_FLOAT4(charWeight1)*scale1+offset1;\n"
" weight2=CONVERT_COMPUTE_FLOAT4(charWeight2)*scale1+offset1;\n"
" weight3=CONVERT_COMPUTE_FLOAT4(charWeight3)*scale1+offset1;\n"
"#else\n"
" charWeightInt40=vload2(0,weight+weight_offset/2+weight_oc_offset/2);\n"
" charWeightInt41=vload2(0,weight+weight_offset/2+weight_oc_offset/2+weight_ic_offset/2);\n"
" charWeightInt42=vload2(0,weight+weight_offset/2+weight_oc_offset/2+weight_ic_offset*2/2);\n"
" charWeightInt43=vload2(0,weight+weight_offset/2+weight_oc_offset/2+weight_ic_offset*3/2);\n"
" charWeight0=(char4)(0,0,0,0);\n"
" charWeight1=(char4)(0,0,0,0);\n"
" charWeight2=(char4)(0,0,0,0);\n"
" charWeight3=(char4)(0,0,0,0);\n"
" charWeight0.x=(charWeightInt40.s0 >> 4)-8;\n"
" charWeight0.y=(charWeightInt40.s0 & MOD_NUM)-8;\n"
" charWeight0.z=(charWeightInt40.s1 >> 4)-8;\n"
" charWeight0.w=(charWeightInt40.s1 & MOD_NUM)-8;\n"
" charWeight1.x=(charWeightInt41.s0 >> 4)-8;\n"
" charWeight1.y=(charWeightInt41.s0 & MOD_NUM)- 8;\n"
" charWeight1.z=(charWeightInt41.s1 >> 4)-8;\n"
" charWeight1.w=(charWeightInt41.s1 & MOD_NUM)-8;\n"
" charWeight2.x=(charWeightInt42.s0 >> 4)-8;\n"
" charWeight2.y=(charWeightInt42.s0 & MOD_NUM)-8;\n"
" charWeight2.z=(charWeightInt42.s1 >> 4)-8;\n"
" charWeight2.w=(charWeightInt42.s1 & MOD_NUM)-8;\n"
" charWeight3.x=(charWeightInt43.s0 >> 4)-8;\n"
" charWeight3.y=(charWeightInt43.s0 & MOD_NUM)-8;\n"
" charWeight3.z=(charWeightInt43.s1 >> 4)-8;\n"
" charWeight3.w=(charWeightInt43.s1 & MOD_NUM)-8;\n"
" weight0=CONVERT_COMPUTE_FLOAT4(charWeight0)*scale1+offset1;\n"
" weight1=CONVERT_COMPUTE_FLOAT4(charWeight1)*scale1+offset1;\n"
" weight2=CONVERT_COMPUTE_FLOAT4(charWeight2)*scale1+offset1;\n"
" weight3=CONVERT_COMPUTE_FLOAT4(charWeight3)*scale1+offset1;\n"
"#endif\n"
" PADZEROSVEC(in_c_idx,inChannel,weight0,weight1,weight2,weight3);\n"
" \n"
" out4=mad(in0.x,weight0,out4);\n"
" out4=mad(in0.y,weight1,out4);\n"
" out4=mad(in0.z,weight2,out4);\n"
" out4=mad(in0.w,weight3,out4);\n"
" \n"
" out5=mad(in1.x,weight0,out5);\n"
" out5=mad(in1.y,weight1,out5);\n"
" out5=mad(in1.z,weight2,out5);\n"
" out5=mad(in1.w,weight3,out5);\n"
" \n"
" out6=mad(in2.x,weight0,out6);\n"
" out6=mad(in2.y,weight1,out6);\n"
" out6=mad(in2.z,weight2,out6);\n"
" out6=mad(in2.w,weight3,out6);\n"
" \n"
" out7=mad(in3.x,weight0,out7);\n"
" out7=mad(in3.y,weight1,out7);\n"
" out7=mad(in3.z,weight2,out7);\n"
" out7=mad(in3.w,weight3,out7);\n"
" \n"
" weight_offset += 4;\n"
" }\n"
" }\n"
" }\n"
"#ifdef RELU\n"
" out0=fmax(out0,(COMPUTE_FLOAT4)0);\n"
" out1=fmax(out1,(COMPUTE_FLOAT4)0);\n"
" out2=fmax(out2,(COMPUTE_FLOAT4)0);\n"
" out3=fmax(out3,(COMPUTE_FLOAT4)0);\n"
" out4=fmax(out4,(COMPUTE_FLOAT4)0);\n"
" out5=fmax(out5,(COMPUTE_FLOAT4)0);\n"
" out6=fmax(out6,(COMPUTE_FLOAT4)0);\n"
" out7=fmax(out7,(COMPUTE_FLOAT4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" out0=clamp(out0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out2=clamp(out2,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out3=clamp(out3,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out4=clamp(out4,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out5=clamp(out5,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out6=clamp(out6,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
" out7=clamp(out7,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
"#endif\n"
" int out_offset=(((out_b_idx+out_c_idx_0*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
"#ifdef BLOCK_LEAVE\n"
" const int remain=out_hw.y-out_w_idx;\n"
" if(remain >= 4){\n"
" vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,output+out_offset);\n"
" }else if(remain == 3){\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out2),2,output+out_offset);\n"
" }else if(remain == 2){\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out0,out1)),0,output+out_offset);\n"
" }else if(remain == 1){\n"
" vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n"
" }\n"
"#ifdef CHANNEL_BOUNDARY_PROTECT\n"
" if(out_c_idx_1 >= out_c_blocks)return;\n"
"#endif\n"
" out_offset=(((out_b_idx+out_c_idx_1*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
" if(remain >= 4){\n"
" vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4,out5,out6,out7)),0,output+out_offset);\n"
" }else if(remain == 3){\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out4,out5)),0,output+out_offset);\n"
" vstore4(CONVERT_FLOAT4(out6),2,output+out_offset);\n"
" }else if(remain == 2){\n"
" vstore8(CONVERT_FLOAT8((COMPUTE_FLOAT8)(out4,out5)),0,output+out_offset);\n"
" }else if(remain == 1){\n"
" vstore4(CONVERT_FLOAT4(out4),0,output+out_offset);\n"
" }\n"
"#else\n"
" vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,output+out_offset);\n"
"#ifdef CHANNEL_BOUNDARY_PROTECT\n"
" if(out_c_idx_1 >= out_c_blocks)return;\n"
"#endif\n"
" out_offset=(((out_b_idx+out_c_idx_1*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n"
" vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4,out5,out6,out7)),0,output+out_offset);\n"
"#endif\n"
"}\n"
;
#endif
}
